from pathlib import Path
import numpy as np
import pandas as pd
from IPython.display import display
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
def make_unique(data: list, sep: str = '-') -> list:
    """Take in a list of strings and make any duplicate strings unique.

    The first occurrence of each string is kept as-is; every later
    occurrence has its running duplicate count appended, joined by 'sep'.
    ex.)
    x = ['a', 'a', 'a', 'b', 'c']
    out = make_unique(x)
    print(out) # ['a', 'a-1', 'a-2', 'b', 'c']
    Args:
    data (list): an iterable that can be interpreted as a list with all elements being strings
    sep (str, optional): what to use to separate the appended unique counts with. Defaults to '-'.
    Returns:
    list: new list with all unique elements
    """
    new_list = list(data)
    # running occurrence count per value; single pass instead of a numpy
    # pre-scan, which needed an extra O(n log n) unique() call
    seen_counts: dict = {}
    for ind, value in enumerate(new_list):
        count = seen_counts.get(value, 0)
        if count:  # not the first occurrence -> disambiguate with its count
            new_list[ind] = f"{value}{sep}{count}"
        seen_counts[value] = count + 1
    return new_list
# --- Build a FASTA file of combined alpha+beta TCR sequences ---
input_file = Path('scRepertoire_outs/scaledClonotypeAbundance.csv')
fasta_file = Path('py-testing/scaledClonotypeAbundance.fa')
# read csv of clonotype abundances exported from scRepertoire
file_df = pd.read_csv(input_file)
# separate 'CTstrict' into 'alpha region', 'alpha chain', 'beta region', 'beta chain'
# n is passed as a keyword: positional n/expand for Series.str.split was
# deprecated and removed in pandas 2.0
# assumes each CTstrict has exactly 3 underscores, i.e. 4 fields -- TODO confirm
file_df[['alpha region', 'alpha chain', 'beta region', 'beta chain']] = \
    file_df['CTstrict'].str.split('_', n=4, expand=True)
# combine 'alpha chain' and 'beta chain' into one sequence string per row
file_df['seq'] = file_df['alpha chain'] + file_df['beta chain']
# save TCR region info for the description of each SeqRecord
file_df['desc'] = file_df[['alpha region', 'beta region']].agg('-'.join, axis=1)
# make 'values' unique ('|'-suffixed counts) so every SeqRecord id is distinct
file_df['id'] = make_unique(file_df['values'], '|')
# wrap each sequence as a Bio.Seq and build one SeqRecord per row
file_df['seq'] = file_df['seq'].apply(Seq)
fasta_data = file_df.apply(lambda row: SeqRecord(row['seq'], id=row['id'], description=row['desc']), axis=1)
fasta_data = fasta_data.tolist()
# save fasta file
SeqIO.write(fasta_data, fasta_file, 'fasta')
4293
# sanity check: count the records actually written to the FASTA file
sum(1 for _ in SeqIO.parse(fasta_file, 'fasta'))
4293
# run from command align after downloading clustalo-1.2.4
"""
cd /mnt/c/Users/Mathew/My\ Drive/LauraRogers/TCR_Repertoire/py-testing/
~/./clustalo-1.2.4 --iter 10 -v --force -i scaledClonotypeAbundance.fa -o scaledClonotypeAbundance.clustalo-iter_10.aln.fa --threads 8
"""
from Bio import AlignIO
# path to the Clustal Omega alignment produced by the shell command above
file_in = "py-testing/scaledClonotypeAbundance.clustalo-iter_10.aln.fa"
aligned_fasta_file = Path(file_in)
# read in aligned TCRs as a single MultipleSeqAlignment object
aligned_fasta = AlignIO.read(aligned_fasta_file, 'fasta')
print(aligned_fasta)
Alignment with 4293 rows and 117 columns ---------------TGTGCTGTGAGGGAGAGGGGTTCTGGGAC...--- TMEV_pilot_CD45 ---------------TGTGCTGTGAGGGATAGAAATTCTGGGAC...--- TMEV_pilot_CD45|1 ---------------TGTGCTGTGAGGGATCGGAATTCTGGGAC...--- TMEV_pilot_CD45|2 ---------------TGTGCTGTGAGGGTAACTTCAAGTGGCCA...--- TMEV_pilot_CD45|3 ---------------TGTGCTGTGAGGGTAACTTCAAGTGGCCA...--- TMEV_pilot_CD45|4 ---------TGTGCTGTGAGGGATAGGACTAACAGTGCAGGGAA...--- TMEV_pilot_CD45|5 ---------TGTGCTGTGAGGGATCGTACTAACAGTGCAGGGAA...--- TMEV_pilot_CD45|6 ---------------TGTGCTGTGAATAGAGGTTCAGCCTTAGG...--- TMEV_pilot_CD45|7 ------------TGTGCTGTGAGGGACAGAGGTTCAGCCTTAGG...--- TMEV_pilot_CD45|8 ------------TGTGCTGTGAGGGACAGAGGTTCAGCCTTAGG...--- TMEV_pilot_CD45|9 ------------TGTGCTGTGAGGGATCGGGGTTCAGCCTTAGG...--- TMEV_pilot_CD45|10 ------------TGTGCTGTGAGGGATCGGGGTTCAGCCTTAGG...--- TMEV_pilot_CD45|11 ---------------TGTGCTGTGAGGGATCGGTCTAATTACAA...--- TMEV_pilot_CD45|12 ---------------TGTGCTGTGAGGGGGATGTCTAATTACAA...--- TMEV_pilot_CD45|13 ---------------TGTGCTGTGAGGGATCGCTCTGGCAGCTG...--- TMEV_pilot_CD45|14 ---------------TGTGCTGTGAGGGATCGCTCTGGCAGCTG...--- TMEV_pilot_CD45|15 ---------------------TGTGCTGTGAGGGAGGGCCCGGG...--- TMEV_pilot_CD45|16 ------------------TGTGCTCCCCGGGACACAAATGCTTA...--- TMEV_pilot_CD45|17 ... ------------TGTGCTCTGGGGGGGCCTGGAGCTAACACTGG...--- Inhibitor_Ctrl|766
"""
cd /mnt/c/Users/Mathew/My\ Drive/LauraRogers/TCR_Repertoire/py-testing/
~/./FastTree -nt scaledClonotypeAbundance.clustalo-iter_10.aln.fa > clustalo-iter_10-tree.nwk
"""
from Bio import Phylo
import seaborn as sns
# seaborn theme + larger default figure size for the tree plot
sns.set_theme()
sns.set(rc={"figure.figsize":(16, 12)})
# NOTE(review): the FastTree command above writes 'clustalo-iter_10-tree.nwk',
# but this reads 'tree.nwk' -- presumably the file was renamed; verify
tree_file = Path("py-testing/tree.nwk")
tree = Phylo.read(tree_file, format='newick')
Phylo.draw(tree)
import pylab
import networkx
# convert the phylogenetic tree to a networkx graph and export it for Cytoscape
net = Phylo.to_networkx(tree)
print(type(net))
networkx.write_graphml(net, "py-testing/tree.nwk.cytoscape.graphml")
networkx.draw(net)
pylab.show()
<class 'networkx.classes.graph.Graph'>
I believe these sequences are so closely related that they are difficult to plot, especially for a large number of samples
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor, DistanceMatrix
from Bio import Phylo
# calculator = DistanceCalculator('identity')
# dm = calculator.get_distance(aligned_fasta)
# constructor = DistanceTreeConstructor(calculator, 'nj')
# tree = constructor.build_tree(aligned_fasta)
# print(tree)
# Phylo.write(tree, "biopython-nj.nwk", "newick")
It takes far too long to do it all in Biopython
So I'll try creating a distance matrix of the TCR reads and then clustering into a dendrogram
# or use a different distance matrix using Levenshtein
from Levenshtein import ratio as lev_ratio
from Levenshtein import distance as lev_dist
from scipy.spatial.distance import pdist, squareform
# column vector of raw sequence strings, shaped (n, 1) as pdist expects 2-D input
seqs = np.array([str(rec.seq) for rec in fasta_data]).reshape(-1, 1)
seqs_id = [str(rec.id) for rec in fasta_data]
# unique per-record ids (with '|N' suffixes) and the general sample names
tcr_names_unique = np.array(seqs_id)
tcr_names_general = np.array([name.split("|")[0] for name in seqs_id])
def dist_lev(u, v):
    """Levenshtein edit distance between two 1-element sequence rows from pdist."""
    a, b = u[0], v[0]
    return lev_dist(a, b)
def ratio_lev(u, v):
    """Normalized Levenshtein similarity ratio between two 1-element rows from pdist."""
    a, b = u[0], v[0]
    return lev_ratio(a, b)
# pairwise Levenshtein distances/ratios over all sequences -- O(n^2)
# Python-level calls, so this is slow for thousands of TCRs
dist_lev_flat = pdist(seqs, dist_lev)
ratio_lev_flat = pdist(seqs, ratio_lev)
# expand the condensed vectors to full square matrices for heatmaps/PCA
dist_lev_square = squareform(dist_lev_flat)
ratio_lev_square = squareform(ratio_lev_flat)
Even with the distance matrix already made, this still takes very long
# lev_square = squareform(lev_flat)
# ind_tril = np.tril_indices(len(lev_square))
# lev_tril_flat = lev_square[ind_tril]
# dm_input = [ [] for x in range(len(lev_square)) ]
# ind = 0
# for i in range(len(lev_square)):
# for j in range(i + 1):
# dm_input[i].append(lev_tril_flat[ind])
# ind = ind + 1
# # stupid how it wants an inefficient list of lists...just use numpy array with tril_indices like I did!
# # now I know why this takes so long to run the DistanceCalculator and DistanceTreeConstructor
# # I should really think about forking and pull requesting a better version for Biopython
# lev_dm = DistanceMatrix(names=seqs_id, matrix=dm_input)
# constructor = DistanceTreeConstructor()
# # this also takes crazy long
# tree = constructor.nj(lev_dm)
# print(tree)
I'm finding it hard to visualize such a large group (4000+) of TCRs with the classic dendrogram.
I'll instead cluster the levenshtein distances from the TCR reads and then visualize
But first visualize the distance matrix
# per-sample TCR counts as a 2-column (name, count) array
unique_counts = np.array(np.unique(tcr_names_general, return_counts=True)).T
# ordered as TMEV_pilot_CD45, TMEV_pilot_CD4/CD8, Inhibitor_Aak1i, Inhibitor_Ctrl
# NOTE(review): the index order [3, 2, 0, 1] is hard-coded against np.unique's
# alphabetical sort -- it silently breaks if sample names change
print(unique_counts[[3, 2, 0, 1]])
sns.heatmap(dist_lev_square)
[['TMEV_pilot_CD45' '1122'] ['TMEV_pilot_CD4/CD8' '1651'] ['Inhibitor_Aak1i' '753'] ['Inhibitor_Ctrl' '767']]
<AxesSubplot:>
Perform ward linkage of levenshtein distances as 'Z' (such is the scipy convention)
Then hierarchical clustering of ward linkage so that there are only 4 clusters created.
4 would match the number of sample types we have; 5 wouldn't be too bad either, since it adds an "other" class for outliers (the code below currently uses 3)
from scipy.cluster import hierarchy
import seaborn as sns
# number of flat clusters to cut the ward dendrogram into
n_cluster = 3
unique_counts = np.array(np.unique(tcr_names_general, return_counts=True)).T
# ordered as TMEV_pilot_CD45, TMEV_pilot_CD4/CD8, Inhibitor_Aak1i, Inhibitor_Ctrl
print('number of TCR types')
print(unique_counts[[3, 2, 0, 1]])
# could be an issue using ward clustering on levenshtein ratios, maybe distances is ok? (not euclidean)
Z = hierarchy.ward(dist_lev_flat)
# cut the tree into at most n_cluster flat clusters
cluster_assn = hierarchy.fcluster(Z, t=n_cluster, criterion='maxclust')
sns.set_theme()
# cluster membership per sample, plotted as grouped counts
tcr_clust = pd.DataFrame({'samples': tcr_names_general, 'cluster': cluster_assn})
g = sns.countplot(data=tcr_clust, x='samples', hue='cluster')
# output the cluster assignments per unique TCR id
# NOTE(review): '.gff' extension with '=' separator looks tool-specific --
# confirm the downstream consumer expects this format
tcr_clust2 = pd.DataFrame({'ID': tcr_names_unique, 'cluster': cluster_assn})
tcr_clust2.to_csv(f"py-testing/cluster-{n_cluster}-assignment.gff", index=False, header=False, sep='=')
# also save just the members of cluster 2 (the smaller major cluster)
small_clust = tcr_clust2[tcr_clust2['cluster'] == 2]
small_clust.to_csv(f"py-testing/cluster-small.csv", index=False, header=False)
number of TCR types [['TMEV_pilot_CD45' '1122'] ['TMEV_pilot_CD4/CD8' '1651'] ['Inhibitor_Aak1i' '753'] ['Inhibitor_Ctrl' '767']]
In general, for total cluster sizes between 3-5 results in a similar distribution. There are always two major clusters, and after that are simply minor clusters
Clustering with 6 or more ends up splitting the two larger clusters into three.
sns.set_theme()
# clustered heatmap reusing the same ward linkage Z for both axes
sns.clustermap(dist_lev_square, row_linkage=Z, col_linkage=Z, figsize=(16, 14))
<seaborn.matrix.ClusterGrid at 0x2587e36e620>
from sklearn.decomposition import PCA

# Project the Levenshtein distance matrix onto its first two principal
# components and scatter-plot the points, colored by sample of origin.
pca = PCA(n_components=2)
X_r = pca.fit(dist_lev_square).transform(dist_lev_square)
sns.set_theme()
sns.set(rc={"figure.figsize":(16, 14)})
g = sns.scatterplot(x=X_r[:, 0], y=X_r[:, 1], hue=tcr_names_general)
# annotate each axis with the fraction of variance its component explains
evr = pca.explained_variance_ratio_
g.set_ylabel(f'PC2 ({round(evr[1], 4)})')
g.set_xlabel(f'PC1 ({round(evr[0], 4)})')
Text(0.5, 0, 'PC1 (0.2166)')